In this notebook, the tweet archive of each library is harvested and saved to a JSON file named [screen_name]_TimeLine_[datestamp].json.
Because of the range limit of the Twitter API, only the last 3,200 tweets can be harvested for each screen_name: the user_timeline endpoint returns at most 200 tweets per request, and the harvester below pages back through at most 16 pages (16 × 200 = 3,200).
In [5]:
# Code from MTSW 2Ed.
# cf. https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition

import twitter

def oauth_login():
    # XXX: Go to http://twitter.com/apps/new to create an app and get values
    # for these credentials that you'll need to provide in place of these
    # empty string values that are defined as placeholders.
    # See https://dev.twitter.com/docs/auth/oauth for more information
    # on Twitter's OAuth implementation.

    CONSUMER_KEY = ''
    CONSUMER_SECRET = ''
    OAUTH_TOKEN = ''
    OAUTH_TOKEN_SECRET = ''

    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)

    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api

# Sample usage
twitter_api = oauth_login()

import sys
import time
from urllib2 import URLError
from httplib import BadStatusLine
import json

def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw):

    # A nested helper function that handles common HTTPErrors. Returns an updated
    # value for wait_period if the problem is a 500-level error. Blocks until the
    # rate limit is reset if it's a rate-limiting issue (429 error). Returns None
    # for 401 and 404 errors, which require special handling by the caller.
    def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):

        if wait_period > 3600:  # Seconds
            print >> sys.stderr, 'Too many retries. Quitting.'
            raise e

        # See https://dev.twitter.com/docs/error-codes-responses for common codes

        if e.e.code == 401:
            print >> sys.stderr, 'Encountered 401 Error (Not Authorized)'
            return None
        elif e.e.code == 404:
            print >> sys.stderr, 'Encountered 404 Error (Not Found)'
            return None
        elif e.e.code == 429:
            print >> sys.stderr, 'Encountered 429 Error (Rate Limit Exceeded)'
            if sleep_when_rate_limited:
                print >> sys.stderr, "Retrying in 15 minutes...ZzZ..."
                sys.stderr.flush()
                time.sleep(60*15 + 5)
                print >> sys.stderr, '...ZzZ...Awake now and trying again.'
                return 2
            else:
                raise e  # Caller must handle the rate limiting issue
        elif e.e.code in (500, 502, 503, 504):
            print >> sys.stderr, 'Encountered %i Error. Retrying in %i seconds' % \
                (e.e.code, wait_period)
            time.sleep(wait_period)
            wait_period *= 1.5
            return wait_period
        else:
            raise e

    # End of nested helper function

    wait_period = 2
    error_count = 0

    while True:
        try:
            return twitter_api_func(*args, **kw)
        except twitter.api.TwitterHTTPError, e:
            error_count = 0
            wait_period = handle_twitter_http_error(e, wait_period)
            if wait_period is None:
                return
        except URLError, e:
            error_count += 1
            time.sleep(wait_period)
            wait_period *= 1.5
            print >> sys.stderr, "URLError encountered. Continuing."
            if error_count > max_errors:
                print >> sys.stderr, "Too many consecutive errors...bailing out."
                raise
        except BadStatusLine, e:
            error_count += 1
            time.sleep(wait_period)
            wait_period *= 1.5
            print >> sys.stderr, "BadStatusLine encountered. Continuing."
            if error_count > max_errors:
                print >> sys.stderr, "Too many consecutive errors...bailing out."
                raise

def harvest_user_timeline(twitter_api, screen_name=None, user_id=None, max_results=3200):

    assert (screen_name != None) != (user_id != None), \
        "Must have screen_name or user_id, but not both"

    kw = {  # Keyword args for the Twitter API call
        'count': 200,
        'trim_user': 'true',
        'include_rts': 'true',
        'since_id': 1
    }

    if screen_name:
        kw['screen_name'] = screen_name
    else:
        kw['user_id'] = user_id

    max_pages = 16
    results = []

    tweets = make_twitter_request(twitter_api.statuses.user_timeline, **kw)

    if tweets is None:  # 401 (Not Authorized) - Need to bail out on loop entry
        tweets = []

    results += tweets

    print >> sys.stderr, 'Fetched %i tweets' % len(tweets)

    page_num = 1

    # Many Twitter accounts have fewer than 200 tweets, so you don't want to enter
    # the loop and waste a precious request if max_results = 200.
    # Note: Analogous optimizations could be applied inside the loop to try and
    # save requests, e.g. don't make a third request if you have 287 tweets out of
    # a possible 400 tweets after your second request. Twitter does do some
    # post-filtering on censored and deleted tweets out of batches of 'count', though,
    # so you can't strictly check for the number of results being 200. You might get
    # back 198, for example, and still have many more tweets to go. If you have the
    # total number of tweets for an account (by GET /users/lookup/), then you could
    # simply use this value as a guide.
    if max_results == kw['count']:
        page_num = max_pages  # Prevent loop entry

    while page_num < max_pages and len(tweets) > 0 and len(results) < max_results:

        # Necessary for traversing the timeline in Twitter's v1.1 API:
        # get the next query's max-id parameter to pass in.
        # See https://dev.twitter.com/docs/working-with-timelines.
        kw['max_id'] = min([tweet['id'] for tweet in tweets]) - 1

        tweets = make_twitter_request(twitter_api.statuses.user_timeline, **kw)
        results += tweets

        print >> sys.stderr, 'Fetched %i tweets' % (len(tweets),)

        page_num += 1

    print >> sys.stderr, 'Done fetching tweets'

    return results[:max_results]

# Save to MongoDB with save_to_mongo or a local file with save_json...
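As a quick check that the credentials and the helpers above work together, a single timeline can be fetched directly. This is a minimal sketch: 'some_library' is a placeholder handle, not one of the accounts harvested below, and with max_results=200 the loop-entry guard in harvest_user_timeline spends only a single request:

tweets = harvest_user_timeline(twitter_api, screen_name='some_library', max_results=200)
print >> sys.stderr, '%i tweets fetched' % len(tweets)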
In [6]:
# import CSV & export JSON

import csv

def impCSV(input_file):
    '''
    input_file = csv with keys: "URL", "Twitter"
    output = list of dictionaries
    '''
    f = open(input_file, 'r')
    d = csv.DictReader(f)
    LoD = []  # list of dictionaries
    for row in d:
        LoD.append(row)
    f.close()
    return LoD
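
# For reference, a minimal sketch of the CSV layout that impCSV expects
# (a header row with the two keys used here; the values are placeholders):
#
#   URL,Twitter
#   http://www.example-library.org,some_library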
def saveTimelineAsJSON(username):
    import io, json, datetime
    datestamp = datetime.datetime.now().strftime('%Y-%m-%d')
    filename = username + '_TimeLine_' + datestamp + '.json'
    timeLine = harvest_user_timeline(twitter_api, screen_name=username)  # calling the harvesting function
    with io.open(filename, 'w', encoding='utf-8') as f:
        f.write(unicode(json.dumps(timeLine, ensure_ascii=False)))
    print 'Tweet archive was saved as ' + filename + '.'
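
# The saving step can also be run for a single account; 'some_library' is
# a placeholder handle, not one of the accounts harvested below:
#
#   saveTimelineAsJSON('some_library')
#   # -> writes some_library_TimeLine_<datestamp>.json to the working directory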
def tweetArchive(Twitterfile):
    '''
    input: the NatBibTwitter.csv etc. filenames
    output: JSON files with the tweet archives of each library (last 3,200 tweets)
    Filenames will be <twitter handle>_TimeLine_<datestamp>.json
    '''
    f = impCSV(Twitterfile)
    for e in f:
        n = e['Twitter']  # get the Twitter handle of the library
        saveTimelineAsJSON(n)  # harvest the archive and save it as a JSON file
In [9]:
tweetArchive('NatBibTwitter2.csv')
In [7]:
tweetArchive('UniBibTwitter2.csv')
In [8]:
tweetArchive('OeBibTwitter2.csv')
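Once the runs above have finished, an archive can be read back in to verify it. A minimal sketch, with a placeholder filename:

import io, json
with io.open('some_library_TimeLine_2014-01-01.json', encoding='utf-8') as f:
    archive = json.load(f)
print '%i tweets in archive' % len(archive)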